# This file is part of the replication packet for "A Low-Cost Information Nudge Increases Citizenship Application Rates Among Low-Income Immigrants"

# This script takes the four ACS source files (two household files and two person files) and combines them.
# It filters the dataset to individuals who would be eligible for naturalization, then identifies and tags those who would likely be eligible for the fee waiver.
# It writes the dataset to a csv file that can be used to create the comparison balance table found in the paper.

# Input
#     psam_husa.csv - 2017 ACS estimates for households (source: https://factfinder.census.gov)
#     psam_husb.csv - 2017 ACS estimates for households (source: https://factfinder.census.gov)
#     psam_pusa.csv - 2017 ACS estimates for persons (source: https://factfinder.census.gov)
#     psam_pusb.csv - 2017 ACS estimates for persons (source: https://factfinder.census.gov)
#     nycPUMAList.csv - a csv file that includes the PUMA numbers for New York City
# Output
#     acs2017Prepped.csv - a CSV file that is used in the script sample_comparison to compare NNY sample to the fee waiver eligible population in NYC, NY, and the USA


# loading needed packages -------------------------------------------------

# start from an empty workspace so that objects left over from an earlier
# session cannot leak into this run
rm(list=ls())

# data.table: fread() is used below to read the large ACS csv files
library(data.table)
# dplyr: filter() and the %>% pipe are used throughout the script
library(dplyr)
# statar: presumably the source of tab() used for the frequency checks below -- confirm
library(statar)
# xtable: not referenced anywhere in this script (NOTE(review): may be unneeded here)
library(xtable)


# setting data location ---------------------------------------------------

### NEED TO CHANGE DATA LOCATIONS

# set data location
#   acs_data_location - folder holding the four psam_*.csv files
#   output_location   - folder where acs2017Prepped.csv is written
#   nyc_puma_location - folder holding nycPUMAList.csv
acs_data_location <- ""
output_location <- ""
nyc_puma_location <- ""

# fail immediately, with a readable message, if any location was left blank.
# setwd("") errors anyway, but for output_location that error would only
# surface at the very end, after all files have been read and processed.
location_names <- c("acs_data_location", "output_location", "nyc_puma_location")
location_values <- c(acs_data_location, output_location, nyc_puma_location)
if (any(!nzchar(location_values))) {
  stop(
    "Set the following data location(s) before running this script: ",
    paste(location_names[!nzchar(location_values)], collapse = ", "),
    call. = FALSE
  )
}
rm(location_names, location_values)

# change directory to data location
setwd(acs_data_location)


# reading in the data files -----------------------------------------------

# the ACS extract comes in an "a" and a "b" part for both the person (pus)
# and the household (hus) records; all four parts are read from the current
# working directory
psam_pusa <- fread(input = "psam_pusa.csv")
psam_pusb <- fread(input = "psam_pusb.csv")
psam_husa <- fread(input = "psam_husa.csv")
psam_husb <- fread(input = "psam_husb.csv")


# list of the PUMAs that make up NYC, read from its own folder
setwd(nyc_puma_location)
nyc_puma <- read.csv(file = "nycPUMAList.csv", stringsAsFactors = FALSE)


# creating merged dataframe from files ------------------------------------

# stack the "a" and "b" parts into one household table and one person table
psam_hus <- rbind(psam_husa, psam_husb)
psam_pus <- rbind(psam_pusa, psam_pusb)


# the four parts are no longer needed; overwrite them so the memory can be
# reclaimed
psam_husa <- psam_husb <- NULL
psam_pusa <- psam_pusb <- NULL

# attach the household record to every person via the serial number;
# all.x = TRUE keeps every person row even without a matching household row
acs2017 <- merge(psam_pus, psam_hus, by = "SERIALNO", all.x = TRUE)
dim(acs2017)

# checking the nativity and citizenship distributions of the full file
table(acs2017$NATIVITY)
table(acs2017$CIT)

# clearing memory: the stacked inputs are now contained in acs2017
psam_pus <- psam_hus <- NULL


# filtering dataframe and identifying fee waiver eligible -----------------


# restrict the file to people comparable to those in the NaturalizeNY
# program: foreign-born non-citizens

# foreign born: NATIVITY == 2 (distribution printed before filtering)
table(acs2017$NATIVITY)
acs2017 <- acs2017 %>% filter(NATIVITY == 2)

# non-citizens: CIT == 5 (distribution printed before filtering)
table(acs2017$CIT)
acs2017 <- acs2017 %>% filter(CIT == 5)

# what remains are the foreign-born non-citizens in the ACS 2017
dim(acs2017)
# row count recorded by the authors: 174817

# preparing the ACS data for merging in NYC codes
# PUMA and ST exist in both the person and the household file, so the merge
# left them as .x/.y pairs. Keep the .x copy (from the person file, the first
# merge argument) under the plain name and drop both suffixed versions.
acs2017$PUMA <- acs2017$PUMA.x
acs2017$PUMA.x <- NULL
acs2017$PUMA.y <- NULL

acs2017$ST <- acs2017$ST.x
acs2017$ST.x <- NULL
acs2017$ST.y <- NULL


# flag for New York State (FIPS code 36): 1 inside the state, 0 otherwise
# (%in% never yields NA, so a missing ST is coded 0)
acs2017$ny_state <- as.numeric(acs2017$ST %in% 36)
table(acs2017$ny_state)

# flag for NYC: in New York State and in one of the listed NYC PUMAs
acs2017$nyc <- as.numeric(
  acs2017$ST %in% 36 & acs2017$PUMA %in% nyc_puma$puma
)
table(acs2017$nyc)


# keeping only those who arrived in 2012 or earlier (YOEP <= 2012); people who
# arrived after 2012 would not generally be eligible to naturalize yet
acs2017 <- acs2017 %>% filter(YOEP <= 2012)
dim(acs2017)
# row count recorded by the authors: 126,941

# household size
# the ACS includes NP (number of people in household) and NPF (number of people in family)
# NPF is used as the family size because it is more appropriate for determining fee waiver eligibility
# records with a missing NPF are treated as a family of one
acs2017$NPF <- replace(acs2017$NPF, is.na(acs2017$NPF), 1)
acs2017$hhsize <- acs2017$NPF

# household income
# the ACS has multiple measures of income, such as PINCP (personal), HINCP (household), and FINCP (family)
# FINCP is used as the income measure


# where family income is missing, fall back on the person's own income (PINCP);
# the distribution for one-person families is printed before and after the
# fill as a check
tab(filter(acs2017, hhsize == 1), FINCP)
acs2017$FINCP <- replace(
  acs2017$FINCP,
  is.na(acs2017$FINCP),
  acs2017$PINCP[is.na(acs2017$FINCP)]
)
tab(filter(acs2017, hhsize == 1), FINCP)


# keeping only family members
# RELP is a relationship code; per the original note, RELP 11-17 are non-family members
# NOTE(review): the filter below keeps RELP < 10, so code 10 is dropped as well --
# confirm against the ACS data dictionary that this is intended
tab(acs2017, RELP)
acs2017 <- acs2017 %>% filter(RELP < 10)
dim(acs2017)
# row count recorded by the authors: 109,113

# dropping people under the age of 18, who don't qualify for the program
tab(acs2017, AGEP)
acs2017 <- filter(acs2017, AGEP >= 18)
dim(acs2017)

# someone is eligible for a fee waiver if their family income is below 150% of the Federal Poverty Guidelines or if they receive means-tested benefits

# coding family income at or below 150% of the poverty guideline
#
# the 150% threshold is linear in family size: 18090 for a one-person family
# plus 6270 for every additional family member (24360 for two, 30630 for
# three, ... 61980 for eight, and so on). Using the formula instead of one
# hard-coded line per size means families of any size are evaluated.
#
# the original note says these figures rely on the Federal Poverty Guidelines for 2018
# NOTE(review): 18090 and 6270 are 150% of 12060 and 4180, which look like the
# 2017 guideline amounts for the 48 contiguous states -- confirm which year is intended
fpl150_one_person <- 18090
fpl150_per_extra_person <- 6270

# income150 is 1 when FINCP is at or below the threshold for the family size
# (NPF) and 0 otherwise; rows with a missing income or family size are coded 0
acs2017$income150 <- with(
  acs2017,
  as.numeric(
    !is.na(FINCP) & !is.na(NPF) & NPF >= 1 &
      FINCP <= fpl150_one_person + fpl150_per_extra_person * (NPF - 1)
  )
)
table(acs2017$income150)


# coding means tested benefits
# every indicator below is 1 when the benefit is reported and 0 otherwise;
# missing values in the source variable are coded 0

# HINS4 is the ACS variable for medicaid coverage (1 = covered)
acs2017$medicaid <- as.numeric(acs2017$HINS4 %in% 1)

# FS is the variable for food stamps (SNAP) (1 = received)
acs2017$snap <- as.numeric(acs2017$FS %in% 1)

# SSIP is the ACS variable for Supplementary Security Income past 12 months;
# any positive amount counts
acs2017$ssi <- as.numeric(!is.na(acs2017$SSIP) & acs2017$SSIP > 0)

# PAP is the ACS variable for public assistance benefits in the past 12 months (TANF);
# any positive amount counts
acs2017$welfare <- as.numeric(!is.na(acs2017$PAP) & acs2017$PAP > 0)


# means tested benefits: 1 when at least one of the four indicators is 1
acs2017$meanstested <- as.numeric(
  acs2017$medicaid == 1 |
    acs2017$snap == 1 |
    acs2017$ssi == 1 |
    acs2017$welfare == 1
)
tab(acs2017, meanstested)

# creating fee waiver eligible: 1 when income is at or below 150% of the
# guideline or any means-tested benefit is received, 0 otherwise
# (%in% TRUE codes an undetermined comparison as 0)
acs2017$fee_waiver_eligible <- as.numeric(
  (acs2017$income150 == 1 | acs2017$meanstested == 1) %in% TRUE
)
tab(acs2017, fee_waiver_eligible)



# outputting the data -----------------------------------------------------


# switch to the output folder and write the prepared ACS data there,
# without a row-name column
setwd(output_location)
write.csv(x = acs2017, file = "acs2017Prepped.csv", row.names = FALSE)





